# coding=utf-8

import re
import html

from fetcher import Fetcher
from replacer import replace

# Substitution rules that main() hands to replacer.replace().
#
# Every entry is a three-item list.  Judging by the values, the items are a
# regex pattern, `re` flags (0 meaning none) and the replacement text; how
# they are actually applied is defined by replacer.replace, which is not
# visible in this file (presumably in the order listed -- confirm there).
re_list = (
    # Overall extraction: keep only capture group 1, i.e. the text between
    # the summary / main-content opening marker and the first closing marker
    # (open-tag title, the "词条图册" album title, or the reference list).
    [
        (
            r'^.*?'
            r'(?:<div class="card-summary-content">|'
            r'class="lemma-main-content">)'
            r'(.*?)'
            r'(?:<div class="open-tag-title">|'
            r'<span class="bacb-title">词条图册</span>|'
            r'<dl id="viewRefer")'
        ),
        re.S | re.I,
        r'\1',
    ],

    # Basic-info box: from the baseInfoWrap div through the next </script>;
    # the replacement keeps the text before and after it.
    # NOTE(review): unlike the rule above, this pattern is a 2-tuple of
    # fragments (there is a comma between the literals), not one concatenated
    # string -- presumably replacer.replace joins the pieces; confirm.
    [
        (
            r'^(.*?)<div class="baseInfoWrap"',
            r'.*?</script>(.*)$',
        ),
        re.S,
        r'\1\2',
    ],

    # Table of contents ("目录"): from its heading through the closing </dl>;
    # the replacement keeps the text before and after it.
    # NOTE(review): also a 2-tuple of fragments, same caveat as the rule above.
    [
        (
            r'^(.*?)<h2><span>目录</span></h2>',
            r'.*?</dd>\s*</div>\s*</dl>(.*)$',
        ),
        re.S,
        r'\1\2',
    ],

    # "Edit this section" link: blank out its visible text.
    [
        r'title="编辑本段">编辑</a>',
        0,
        r'></a>',
    ],

    # Index number in front of each section heading: start it on a new line.
    [
        r'(<span class="headline-1-index">)',
        0,
        r'\n\1',
    ],

    # Section heading: put a newline after it.
    [
        r'(<span class="headline-content">.*?</span>)',
        re.S,
        r'\1\n',
    ],

    # "Main entry" ("主词条") pointer: removed up to the closing </div>.
    [
        r'<span>主词条: </span>.*?</div>',
        re.S,
        r'',
    ],

    # Tables: each one becomes a placeholder meaning "one table omitted".
    [
        r'<table log-set-param="table_view".*?</table>',
        re.S,
        r'[省略一张表格]',
    ],

    # Captions underneath pictures.
    [
        r'<span class="description">.*?</span>',
        re.S,
        r'',
    ],

    # <script> elements.
    [
        r'<script.*?</script>',
        re.S,
        r'',
    ],

    # <style> elements.
    [
        r'<style.*?</style>',
        re.S,
        r'',
    ],

    # ------------------------------------------------------------------
    # The rules above target specific page structures; the rules below
    # strip whatever markup is left and tidy up whitespace.
    # ------------------------------------------------------------------

    # Any HTML tag; quoted attribute values may themselves contain '>'.
    [
        r'''<(?:[^"'>]|"[^"]*"|'[^']*')*>''',
        0,
        r'',
    ],

    # Reference markers such as [12] or [3-5].
    [
        r'\[[\d-]*\]',
        0,
        r'',
    ],

    # Carriage returns (\r).
    [
        r'\x0D+',
        0,
        '',
    ],

    # Runs of three or more spaces collapse to two.
    [
        r' {3,}',
        0,
        r'  ',
    ],

    # Runs of three or more newlines collapse to two.
    [
        r'\n{3,}',
        0,
        r'\n\n',
    ],
)
    
def _safe_filename(title):
    """Turn a page title into a string that is safe to use as a file name."""
    # Decode HTML entities (e.g. "&amp;") so the name reads like the title.
    name = html.unescape(title)
    # The title comes from a remote page: replace path separators, the
    # characters Windows forbids in file names, and control characters.
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name)
    # Dropping surrounding blanks and dots also neutralises "." and "..".
    name = name.strip(' .')
    return name or 'untitled'


def main():
    """Download a Baidu Baike page and save its text as ``<title>.txt``.

    Prompts for the page URL, fetches it with ``Fetcher``, decodes the
    payload as UTF-8, runs it through ``replace`` with ``re_list``,
    unescapes HTML entities and writes the result (UTF-8 with BOM) to the
    current directory.  The file name is taken from the page's ``<title>``;
    if no matching title is found the name falls back to ``untitled``.

    A failed download or a decoding error is reported on stdout and the
    function returns without writing anything.
    """
    url = input('输入百度百科的页面网址:').strip()

    # Download.
    raw = Fetcher().fetch_url(url)
    if not raw:
        print('下载失败')
        return

    # Decode.  Only a genuine decoding failure is reported here; other
    # exceptions (including Ctrl-C) are no longer swallowed.
    # NOTE(review): assumes fetch_url returns bytes -- confirm in fetcher.
    try:
        page = raw.decode('utf-8')
    except UnicodeDecodeError:
        print('解码失败')
        return

    # File name: re.search returns None when the title does not match, so
    # guard before calling .group().
    title_match = re.search(r'<title>(.*?)_百度百科</title>', page)
    fn = _safe_filename(title_match.group(1) if title_match else '')

    # Apply the regex rule list, then decode the remaining HTML entities.
    text = replace(page, re_list)
    text = html.unescape(text).strip()

    # Write the result.
    with open(fn + '.txt', 'w', encoding='utf-8-sig') as out:
        out.write(text)

# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()